@@ -33,7 +33,7 @@ module Agents |
||
33 | 33 |
|
34 | 34 |
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
35 | 35 |
|
36 |
- Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document. |
|
36 |
+ Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless the top-level option `use_namespaces` is set to true. |
|
37 | 37 |
|
38 | 38 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
39 | 39 |
|
@@ -302,9 +302,13 @@ module Agents |
||
302 | 302 |
end |
303 | 303 |
|
304 | 304 |
# Decides whether XML namespaces are kept when the document is parsed.
#
# An explicit top-level `use_namespaces` option takes precedence; its value
# is passed through `boolify` and that result is returned as-is.
# When the option is absent, namespaces are kept only if no entry under
# `extract` has an `xpath` key.
#
# @return the coerced option when `use_namespaces` is set (exact values
#   depend on `boolify`, which is defined elsewhere); otherwise true or false
def use_namespaces?
  # Only the presence of the key is tested here; the value is read below.
  if interpolated.key?('use_namespaces')
    boolify(interpolated['use_namespaces'])
  else
    interpolated['extract'].none? { |_name, extraction_details|
      extraction_details.key?('xpath')
    }
  end
end
309 | 313 |
|
310 | 314 |
def extract_each(&block) |
@@ -401,6 +401,28 @@ describe Agents::WebsiteAgent do |
||
401 | 401 |
expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30') |
402 | 402 |
end |
403 | 403 |
|
404 |
# With `use_namespaces` switched on, the checker's existing extract options
# are expected to produce no events; namespace-qualified XPath expressions
# are then expected to produce all 20.
it "works with XPath with namespaces unstripped" do
  @checker.options['use_namespaces'] = 'true'
  @checker.save!

  expect { @checker.check }.to change { Event.count }.by(0)

  entry_xpath = '/xmlns:feed/xmlns:entry'
  @checker.options['extract'] = {
    'title' => { 'xpath' => entry_xpath, 'value' => 'normalize-space(./xmlns:title)' },
    'url' => { 'xpath' => entry_xpath, 'value' => './xmlns:link[1]/@href' },
    'thumbnail' => { 'xpath' => entry_xpath, 'value' => './media:thumbnail/@url' },
  }
  @checker.save!

  expect { @checker.check }.to change { Event.count }.by(20)

  # Inspect the most recently created event.
  payload = Event.last.payload
  expect(payload['title']).to eq('Shift to dev group')
  expect(payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
  expect(payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
end
|
425 |
+ |
|
404 | 426 |
it "works with CSS selectors" do |
405 | 427 |
@checker.options['extract'] = { |
406 | 428 |
'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' }, |
@@ -429,6 +451,23 @@ describe Agents::WebsiteAgent do |
||
429 | 451 |
expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af') |
430 | 452 |
expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30') |
431 | 453 |
end |
454 |
+ |
|
455 |
# Sets `use_namespaces` to 'false' explicitly while every extractor is
# CSS-based, and expects un-prefixed element paths to yield all 20 events.
it "works with CSS selectors with namespaces stripped" do
  entry_selector = 'feed > entry'
  @checker.options['extract'] = {
    'title' => { 'css' => entry_selector, 'value' => 'normalize-space(./title)' },
    'url' => { 'css' => entry_selector, 'value' => './link[1]/@href' },
    'thumbnail' => { 'css' => entry_selector, 'value' => './thumbnail/@url' },
  }
  @checker.options['use_namespaces'] = 'false'
  @checker.save!

  expect { @checker.check }.to change { Event.count }.by(20)

  # Inspect the most recently created event.
  payload = Event.last.payload
  expect(payload['title']).to eq('Shift to dev group')
  expect(payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
  expect(payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
end
|
432 | 471 |
end |
433 | 472 |
|
434 | 473 |
describe "JSON" do |